# LOAD
import os
import re
import sqlite3
import datetime
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import cross_validation 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier

# Path of the SQLite file holding the replication tables, and the shared
# module-level connection that get_data() passes to pd.read_sql().
database = 'replication_20160118.sqlite'
conn = sqlite3.connect(database)

def get_data(sample, predictors, variables):
    """Build the merged analysis table for one sample.

    Reads the ``wdi``, ``guo`` and ``distance`` tables, the table named by
    ``sample`` and the table ``predictors_<predictors>`` through the
    module-level ``conn``, merges them, and keeps only the requested columns.

    Parameters
    ----------
    sample : str
        Name of the sample table to read.
    predictors : str
        Suffix of the ``predictors_*`` table to read.
    variables : list of str
        Regular-expression patterns; a column is kept when ``re.search``
        finds any of them in its name.  The list is not modified.

    Returns
    -------
    pandas.DataFrame
        Selected columns, with ``iso2c_guo``, ``id_guo`` and ``iso2c_sub``
        moved to the front and rows sorted by those three columns.
    """
    wdi = pd.read_sql('SELECT * FROM wdi', conn)
    guo = pd.read_sql('SELECT * FROM guo', conn)
    # Table names cannot be bound as SQL parameters, so they are concatenated
    # into the query text; only pass trusted table names here.
    sample_df = pd.read_sql('SELECT * FROM ' + sample, conn)
    distance = pd.read_sql('SELECT * FROM distance', conn)
    political = pd.read_sql('SELECT * FROM predictors_' + predictors, conn)

    # Left merges (on the shared column names) keep every sample row; the
    # merges below use pandas' default inner join on the subsidiary country.
    dat = pd.merge(sample_df, guo, how='left')
    dat = pd.merge(dat, distance, how='left')
    dat = pd.merge(dat, wdi, left_on='iso2c_sub', right_on='iso2c')
    dat = pd.merge(dat, political, left_on='iso2c_sub', right_on='iso2c')

    # Every predictor column except the first is added to the patterns
    # (presumably the first column is the iso2c merge key -- TODO confirm).
    patterns = variables + political.columns.tolist()[1:]
    # Compile the alternation once rather than once per column name.
    wanted = re.compile('|'.join(patterns))
    cols = [x for x in dat.columns if wanted.search(x) is not None]
    dat = dat[cols]

    # set_index / sort_index / reset_index orders the rows by the three id
    # columns and leaves those columns first in the frame.
    dat = dat.set_index(['iso2c_guo', 'id_guo', 'iso2c_sub']).sort_index().reset_index()
    return dat

def get_arrays(dat):
    """Turn a data frame from get_data() into train/test arrays.

    Rows containing any missing value are dropped.  The outcome is the
    ``sub_dummy_host`` column; the features are every column from the fifth
    onwards (assumes the first four columns are the three id columns plus the
    outcome -- TODO confirm against get_data's column order).  A second,
    "apolitical" feature matrix omits every column whose name matches a
    variable flagged ``political == 1`` in ``codebook.csv``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Table with a ``sub_dummy_host`` column and the feature columns.

    Returns
    -------
    dict
        ``y_train``, ``y_test``, ``X_pol_train``, ``X_pol_test``,
        ``X_apol_train``, ``X_apol_test`` (arrays from a 50/50 split seeded
        with ``random_state=1024``) plus ``variables_pol`` and
        ``variables_apol`` (the column names of the two feature matrices).
    """
    dat = dat.dropna()
    # variables
    codebook = pd.read_csv('codebook.csv')
    political = codebook.variable[codebook.political == 1].tolist()
    # arrays
    # `.values` replaces `as_matrix()`, which was deprecated and later removed
    # from pandas; `.values` is available in old and new versions alike.
    y = dat.sub_dummy_host.values
    X = dat.iloc[:, 4:]
    if political:
        political_re = re.compile('|'.join(political))
        apol_cols = [x for x in X.columns if political_re.search(x) is None]
    else:
        # With no political variables the joined pattern would be the empty
        # string, which matches every name and would drop every column.
        apol_cols = X.columns.tolist()
    X_apol = X[apol_cols]
    variables_pol = X.columns.tolist()
    variables_apol = X_apol.columns.tolist()
    X_pol = X.values
    X_apol = X_apol.values
    # NOTE(review): `cross_validation` comes from the file-level import; newer
    # scikit-learn releases expose train_test_split under `model_selection`.
    arrays = cross_validation.train_test_split(y, X_pol, X_apol, train_size=.5, random_state=1024)
    # train_test_split returns a (train, test) pair per input, in input order.
    labels = ['y_train', 'y_test', 'X_pol_train',
              'X_pol_test', 'X_apol_train', 'X_apol_test']
    out = dict(zip(labels, arrays))
    out['variables_pol'] = variables_pol
    out['variables_apol'] = variables_apol
    return out

def estimate(arrays, cls):
    """Score one classifier on the apolitical and the political features.

    The classifier's ``random_state`` attribute is set to 1024, then it is
    fitted and scored first on the ``X_apol_*`` arrays and then on the
    ``X_pol_*`` arrays, so it is left fitted on the political features.

    Parameters
    ----------
    arrays : dict
        Mapping with the ``y_*``, ``X_apol_*`` and ``X_pol_*`` train/test
        entries, as produced by get_arrays().
    cls : object
        Classifier exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``score_apol`` and ``score_pol``.
    """
    cls.random_state = 1024
    feature_sets = (
        ('X_apol_train', 'X_apol_test'),
        ('X_pol_train', 'X_pol_test'),
    )
    scores = []
    for train_key, test_key in feature_sets:
        cls.fit(arrays[train_key], arrays['y_train'])
        scores.append(cls.score(arrays[test_key], arrays['y_test']))
    # Label the two scores as rows, then transpose into a single-row frame.
    return pd.DataFrame(scores, index=['score_apol', 'score_pol']).T

class Logistic():
    """Adapter giving a statsmodels Logit the fit/score interface used by estimate()."""

    def fit(self, X, y):
        """Fit a Logit of ``y`` on ``X`` and keep the fitted result in ``self.cls``.

        ``X`` is handed to ``sm.Logit`` as-is; no constant column is added here.
        """
        model = sm.Logit(y, X)
        self.cls = model.fit()

    def score(self, X, y):
        """Return the share of rows whose thresholded prediction equals ``y``.

        A row is predicted as 1 when its predicted value is strictly greater
        than .5, otherwise 0.
        """
        predicted = self.cls.predict(X)
        labels = np.where(predicted > .5, 1, 0)
        hits = labels == y
        return np.mean(hits)

# Classifiers to compare, keyed by a display label.  Each scikit-learn model
# is created with random_state=1024; the two tree ensembles use 50 estimators.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=1024),
    'Extremely Randomized Trees': ExtraTreesClassifier(n_estimators=50, random_state=1024),
    'Ada Boost': AdaBoostClassifier(random_state=1024),
    'Bagging': BaggingClassifier(random_state=1024),
    'Logistic': Logistic(),
}

# Names of the sample tables available in the database.
tables = ['sample', 'sample_klarge', 'sample_hi', 'sample_lo', 'sample_havens',
          'sample_outliers', 'sample_strata']
# Column-name patterns; get_data() joins them with '|' and applies them with
# re.search.  'naics\d' is a raw string so that the backslash is not read as
# an (invalid) string escape sequence; the resulting pattern is unchanged.
variables = ['id_guo', 'iso2c.guo', 'iso2c.sub', 'iso_count', 'sub_dummy_host',
             r'naics\d', 'distw', '^gdp$']
# Variants of the baseline pattern list with extra or fewer patterns.
variables_distance = variables + ['economic_distance', 'administrative_distance']
variables_wdi = variables + ['gdppc', 'pop', 'pop_urban', 'telephone']
variables_nodistance = [x for x in variables if x != 'distw']

# Load the baseline sample and convert every loaded data set into
# train/test arrays, keeping the same keys.
print('prepare data')
data = {'baseline': get_data('sample', 'ksmall', variables)}
print('prepare arrays')
arrays = {name: get_arrays(frame) for name, frame in data.items()}

# print('estimate models')
# results = []
# for a in arrays.keys():
    # for c in classifiers.keys():
        # print(a, c)
        # tmp = estimate(arrays[a], classifiers[c])
        # tmp['sample'] = a
        # tmp['classifier'] = c
        # results.append(tmp)
# results = pd.concat(results)
# results.to_csv('tables/table_6_21_22.csv', index=False)

# # TUNING
# arr = get_arrays(data['baseline'])
# tuning = []
# for criterion in ['gini', 'entropy']:
    # for max_features in [5, 20, 'auto']:
        # for n_estimators in [10, 50, 100]:
            # for min_samples_leaf in [1, 50]:
                # print(criterion, max_features, n_estimators, min_samples_leaf)
                # cls = RandomForestClassifier(
                        # random_state=1024, criterion=criterion,
                        # max_features=max_features, n_estimators=n_estimators,
                        # min_samples_leaf=min_samples_leaf)
                # tmp = estimate(arr, cls).iloc[0,:].tolist()
                # tmp = [criterion, max_features, n_estimators, min_samples_leaf] + tmp
                # var = ['Criterion', 'Max features', 'N estimators', 'Minimum samples per leaf', 'Apolitical', 'Political']
                # tmp = pd.DataFrame(list(zip(var, tmp)))
                # tmp.columns = ['Setting', 'Value']
                # tmp = tmp.set_index('Setting')
                # tmp = tmp.T
                # tuning.append(tmp)
# k = pd.concat(tuning)
# k.to_csv('tables/table_23.csv', index=False)

# FIGURE 3
# Fit a random forest on the political feature set of the baseline sample
# and write each variable's feature importance to figure3.csv.
arr = get_arrays(data['baseline'])
cls = RandomForestClassifier(n_estimators=50, random_state=1024)
cls.fit(arr['X_pol_train'], arr['y_train'])
feature_weights = np.ravel(cls.feature_importances_).tolist()
# Two labelled rows (names, weights) transposed into two columns.
importance = pd.DataFrame([arr['variables_pol'], feature_weights],
                          index=['Variable', 'Importance']).T
importance.to_csv('figure3.csv', index=False)

# LOG FILE
# Completion marker: (re)write firm.log once the statements above have run.
with open('firm.log', 'w') as log_file:
    log_file.write('Done!')
